
Commit 98a1012

mcremon-meta authored and facebook-github-bot committed
Enable int8 support for quantized_linear and quantized_relu reference (#6334)
Summary: As titled.
Reviewed By: zonglinpeng
Differential Revision: D64553726
1 parent 16b633b commit 98a1012

7 files changed (+71 / -14 lines)


backends/cadence/aot/ops_registrations.py

Lines changed: 1 addition & 1 deletion
@@ -185,7 +185,7 @@ def quantized_relu_meta(
     out_multiplier: torch.Tensor,
     out_shift: torch.Tensor,
 ) -> torch.Tensor:
-    return X.new_empty(X.size(), dtype=torch.uint8)
+    return X.new_empty(X.size(), dtype=X.dtype)


 @register_fake("cadence::quantized_matmul")

backends/cadence/hifi/operators/dequantize_per_tensor.cpp

Lines changed: 4 additions & 1 deletion
@@ -44,7 +44,10 @@ void dequantize_per_tensor_out(
     impl::HiFi::kernels::dequantize<int32_t>(
         out_data, input_data, scale, zero_point, numel);
   } else {
-    ET_CHECK_MSG(false, "Unhandled input dtype %hhd", input.scalar_type());
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(input.scalar_type()));
   }
 }
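
For context, ET_CHECK_MSG takes a printf-style format string, and "%hhd" expects a plain signed 8-bit integer; passing the scoped ScalarType enum directly through the varargs can trip -Wformat warnings or undefined behavior, which is what the new static_cast avoids. A minimal standalone sketch of the same idea (the enum below is a stand-in, not the real executorch ScalarType):

#include <cstdint>
#include <cstdio>

// Stand-in for executorch's ScalarType; names and values are illustrative only.
enum class FakeScalarType : int8_t { Byte = 0, Char = 1, Float = 6 };

int main() {
  FakeScalarType t = FakeScalarType::Float;
  // The explicit cast hands "%hhd" the int8_t it expects instead of a
  // scoped enum, mirroring the ET_CHECK_MSG change in this commit.
  std::printf("Unhandled input dtype %hhd\n", static_cast<int8_t>(t));
  return 0;
}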

backends/cadence/hifi/operators/quantize_per_tensor.cpp

Lines changed: 4 additions & 2 deletions
@@ -50,8 +50,10 @@ void quantize_per_tensor_out(
     cadence::impl::HiFi::kernels::quantize<int32_t>(
         out_data, input_data, 1. / scale, zero_point, numel);
   } else {
-    ET_CHECK_MSG(false, "Unhandled input dtype %hhd", out.scalar_type());
-  }
+    ET_CHECK_MSG(
+        false,
+        "Unhandled output dtype %hhd",
+        static_cast<int8_t>(out.scalar_type())); }
 }

 }; // namespace native

backends/cadence/hifi/operators/quantized_layer_norm.cpp

Lines changed: 4 additions & 1 deletion
@@ -151,7 +151,10 @@ void quantized_layer_norm_out(
         output_zero_point,
         out);
   } else {
-    ET_CHECK_MSG(false, "Unhandled input dtype %hhd", input.scalar_type());
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(input.scalar_type()));
   }
 }

backends/cadence/reference/operators/quantized_conv_out.cpp

Lines changed: 5 additions & 0 deletions
@@ -248,6 +248,11 @@ void quantized_conv_out(
         output_scale,
         (int8_t)output_zero_point,
         per_tensor_quantized);
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(input.scalar_type()));
   }
 }

backends/cadence/reference/operators/quantized_linear_out.cpp

Lines changed: 48 additions & 9 deletions
@@ -17,8 +17,8 @@ using executorch::aten::Tensor;
 using executorch::runtime::getLeadingDims;
 using executorch::runtime::KernelRuntimeContext;

-void quantized_linear_out(
-    KernelRuntimeContext& ctx,
+template <typename T>
+void inline _typed_quantized_linear(
     const Tensor& src,
     const Tensor& weight,
     const Tensor& bias,
@@ -27,14 +27,11 @@ void quantized_linear_out(
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     int64_t out_zero_point,
-    const executorch::aten::optional<Tensor>& offset,
     Tensor& out) {
-  // Assuming uint8_t for now, but needs to be updated for other quantization
-  // types
-  const uint8_t* __restrict__ src_data = src.const_data_ptr<uint8_t>();
-  const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
+  const T* __restrict__ src_data = src.const_data_ptr<T>();
+  const T* __restrict__ weight_data = weight.const_data_ptr<T>();
   const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
-  uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();
+  T* __restrict__ out_data = out.mutable_data_ptr<T>();

   int32_t weight_zero_point = weight_zero_point_t.const_data_ptr<int32_t>()[0];
@@ -71,11 +68,53 @@ void quantized_linear_out(
           (weight_data[j * N + k] - weight_zero_point);
       }
       out_data[i * M + j] =
-          kernels::quantize<uint8_t>(sum, out_scale, out_zero_point);
+          kernels::quantize<T>(sum, out_scale, out_zero_point);
     }
   }
 }

+void quantized_linear_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& src,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t src_zero_point,
+    const Tensor& weight_zero_point_t,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    const executorch::aten::optional<Tensor>& offset,
+    Tensor& out) {
+  if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
+    _typed_quantized_linear<uint8_t>(
+        src,
+        weight,
+        bias,
+        src_zero_point,
+        weight_zero_point_t,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        out);
+  } else if (out.scalar_type() == executorch::aten::ScalarType::Char) {
+    _typed_quantized_linear<int8_t>(
+        src,
+        weight,
+        bias,
+        src_zero_point,
+        weight_zero_point_t,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(src.scalar_type()));
+  }
+}
+
 }; // namespace native
 }; // namespace reference
 }; // namespace impl
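
The per-element kernels::quantize<T> helper is not part of this diff, so as a hedged sketch only: such a requantization step typically scales the accumulator, rounds, adds the output zero point, and clamps to T's representable range, which is exactly where the uint8_t (Byte) vs int8_t (Char) template argument changes behavior. Names and semantics below are assumptions, not the actual cadence kernel:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// Illustrative sketch of a round-and-clamp requantization; not the real
// cadence::kernels::quantize implementation.
template <typename T>
T quantize_sketch(float value, float scale, int32_t zero_point) {
  const float transformed = std::round(value * scale) + zero_point;
  // Clamp bounds come from T: [-128, 127] for int8_t, [0, 255] for uint8_t.
  const float lo = static_cast<float>(std::numeric_limits<T>::min());
  const float hi = static_cast<float>(std::numeric_limits<T>::max());
  return static_cast<T>(std::clamp(transformed, lo, hi));
}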

backends/cadence/reference/operators/quantized_matmul_out.cpp

Lines changed: 5 additions & 0 deletions
@@ -144,6 +144,11 @@ void quantized_matmul_out(
       out_zero_point,
       transposed,
       out);
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(X.scalar_type()));
   }
 }
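
Taken together, the commit applies the same dispatch-or-fail pattern across the reference operators: handle the known dtypes explicitly, and abort loudly on anything else. A condensed C++ sketch of that pattern, with placeholder names rather than the actual ExecuTorch/cadence APIs:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Placeholder enum standing in for executorch::aten::ScalarType.
enum class FakeScalarType : int8_t { Byte = 0, Char = 1, Float = 6 };

template <typename T>
void typed_kernel(const void* in, void* out, int64_t numel) {
  // Per-dtype implementation would go here (e.g. _typed_quantized_linear<T>).
}

void dispatch(FakeScalarType dtype, const void* in, void* out, int64_t numel) {
  if (dtype == FakeScalarType::Byte) {
    typed_kernel<uint8_t>(in, out, numel);  // uint8 path (pre-existing)
  } else if (dtype == FakeScalarType::Char) {
    typed_kernel<int8_t>(in, out, numel);   // int8 path (newly enabled)
  } else {
    // Mirrors ET_CHECK_MSG(false, ...): fail instead of silently
    // reinterpreting data of an unsupported dtype.
    std::fprintf(stderr, "Unhandled input dtype %hhd\n",
                 static_cast<int8_t>(dtype));
    std::abort();
  }
}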
