
Commit 918c6ab

mcremon-meta authored and facebook-github-bot committed
Fix missing things from OSS kernels updates
Summary: Add a few missing pieces from a previous change

Differential Revision: D81863822
1 parent 2845fd3 commit 918c6ab

File tree

7 files changed: +64 −56 lines
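
All seven files apply the same two-part change: each file gains file-scope using-declarations that import the quantize/dequantize kernel helpers, and the call sites drop the namespace qualifier. Schematically (a sketch of the pattern, not a verbatim excerpt):

    // Before: each call spells out the helper's namespace.
    float x = kernels::dequantize<int8_t>(X_data[i], X_scale_f, X_zero_point_i32);

    // After: one using-declaration per helper at file scope ...
    using ::impl::reference::kernels::dequantize;
    // ... lets call sites shorten to:
    float x = dequantize<int8_t>(X_data[i], X_scale_f, X_zero_point_i32);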

backends/cadence/hifi/operators/op_quantized_add_asym8sxasym8s_asym8s_per_tensor_out.cpp

Lines changed: 11 additions & 9 deletions
@@ -16,6 +16,8 @@ namespace native {
 
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
+using ::impl::reference::kernels::quantize;
 
 void quantized_add_asym8sxasym8s_asym8s_per_tensor_out(
     KernelRuntimeContext& ctx,
@@ -62,24 +64,24 @@ void quantized_add_asym8sxasym8s_asym8s_per_tensor_out(
   } /* if Y is a scalar Tensor */
   else if (Y_numel == 1) {
     float y =
-        kernels::dequantize<int8_t>(Y_data[0], Y_scale_f, Y_zero_point_i32);
+        dequantize<int8_t>(Y_data[0], Y_scale_f, Y_zero_point_i32);
     for (size_t i = 0; i < X_numel; ++i) {
       float x =
-          kernels::dequantize<int8_t>(X_data[i], X_scale_f, X_zero_point_i32);
+          dequantize<int8_t>(X_data[i], X_scale_f, X_zero_point_i32);
       float z = x + y;
       out_data[i] =
-          kernels::quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
+          quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   } /* if X is a scalar Tensor */
   else if (X_numel == 1) {
     float x =
-        kernels::dequantize<int8_t>(X_data[0], X_scale_f, X_zero_point_i32);
+        dequantize<int8_t>(X_data[0], X_scale_f, X_zero_point_i32);
     for (size_t i = 0; i < Y_numel; ++i) {
       float y =
-          kernels::dequantize<int8_t>(Y_data[i], Y_scale_f, Y_zero_point_i32);
+          dequantize<int8_t>(Y_data[i], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
       out_data[i] =
-          kernels::quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
+          quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   } /* other broadcasting cases */
   else {
@@ -162,13 +164,13 @@ void quantized_add_asym8sxasym8s_asym8s_per_tensor_out(
       }
 
       /* Apply the operation */
-      float x = kernels::dequantize<int8_t>(
+      float x = dequantize<int8_t>(
           X_data[X_idx], X_scale_f, X_zero_point_i32);
-      float y = kernels::dequantize<int8_t>(
+      float y = dequantize<int8_t>(
           Y_data[Y_idx], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
       out_data[i] =
-          kernels::quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
+          quantize<int8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   }
 }
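
For context, the dequantize/quantize helpers imported above implement standard affine (de)quantization. A minimal sketch, with signatures inferred from the call sites (the real kernels may differ in rounding and saturation details):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Sketch: map a quantized value back to float.
    template <typename T>
    float dequantize(T value, float scale, int32_t zero_point) {
      return scale * static_cast<float>(static_cast<int32_t>(value) - zero_point);
    }

    // Sketch: the call sites pass 1/scale (inv_out_scale), so quantize
    // multiplies, rounds to nearest, and saturates to T's range.
    template <typename T>
    T quantize(float value, float inv_scale, int32_t zero_point) {
      float q = std::nearbyintf(value * inv_scale) + static_cast<float>(zero_point);
      q = std::min(q, static_cast<float>(std::numeric_limits<T>::max()));
      q = std::max(q, static_cast<float>(std::numeric_limits<T>::min()));
      return static_cast<T>(q);
    }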

backends/cadence/hifi/operators/op_quantized_add_asym8uxasym8u_asym8u_per_tensor_out.cpp

Lines changed: 11 additions & 9 deletions
@@ -16,6 +16,8 @@ namespace native {
 
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
+using ::impl::reference::kernels::quantize;
 
 void quantized_add_asym8uxasym8u_asym8u_per_tensor_out(
     KernelRuntimeContext& ctx,
@@ -62,24 +64,24 @@ void quantized_add_asym8uxasym8u_asym8u_per_tensor_out(
   } /* if Y is a scalar Tensor */
   else if (Y_numel == 1) {
     float y =
-        kernels::dequantize<uint8_t>(Y_data[0], Y_scale_f, Y_zero_point_i32);
+        dequantize<uint8_t>(Y_data[0], Y_scale_f, Y_zero_point_i32);
     for (size_t i = 0; i < X_numel; ++i) {
       float x =
-          kernels::dequantize<uint8_t>(X_data[i], X_scale_f, X_zero_point_i32);
+          dequantize<uint8_t>(X_data[i], X_scale_f, X_zero_point_i32);
       float z = x + y;
       out_data[i] =
-          kernels::quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
+          quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   } /* if X is a scalar Tensor */
   else if (X_numel == 1) {
     float x =
-        kernels::dequantize<uint8_t>(X_data[0], X_scale_f, X_zero_point_i32);
+        dequantize<uint8_t>(X_data[0], X_scale_f, X_zero_point_i32);
     for (size_t i = 0; i < Y_numel; ++i) {
       float y =
-          kernels::dequantize<uint8_t>(Y_data[i], Y_scale_f, Y_zero_point_i32);
+          dequantize<uint8_t>(Y_data[i], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
       out_data[i] =
-          kernels::quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
+          quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   } /* other broadcasting cases */
   else {
@@ -162,13 +164,13 @@ void quantized_add_asym8uxasym8u_asym8u_per_tensor_out(
       }
 
      /* Apply the operation */
-      float x = kernels::dequantize<uint8_t>(
+      float x = dequantize<uint8_t>(
           X_data[X_idx], X_scale_f, X_zero_point_i32);
-      float y = kernels::dequantize<uint8_t>(
+      float y = dequantize<uint8_t>(
           Y_data[Y_idx], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
       out_data[i] =
-          kernels::quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
+          quantize<uint8_t>(z, inv_out_scale, out_zero_point_i32);
     }
   }
 }

backends/cadence/hifi/operators/op_quantized_layer_norm.cpp

Lines changed: 4 additions & 4 deletions
@@ -18,6 +18,8 @@ using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::getLeadingDims;
 using ::executorch::runtime::KernelRuntimeContext;
+using ::cadence::impl::HiFi::kernels::dequantize;
+using ::cadence::impl::HiFi::kernels::quantize;
 
 namespace cadence {
 namespace impl {
@@ -80,11 +82,9 @@ void quantized_layer_norm_per_tensor_(
     for (size_t j = 0; j < last_dim; ++j) {
       // Since X is quantized, we dequantize it, compute fp32 result, and
       // quantize the result to an int8/uint8 value.
-      float val = ::cadence::impl::HiFi::kernels::dequantize<T>(
-          x[j], input_scale, input_zero_point);
+      float val = dequantize<T>(x[j], input_scale, input_zero_point);
       val = (val - mean) * inv_std * weight_data[j] + bias_data[j];
-      y[j] = ::cadence::impl::HiFi::kernels::quantize<T>(
-          val, output_inv_scale, output_zero_point);
+      y[j] = quantize<T>(val, output_inv_scale, output_zero_point);
     }
   }
 }

backends/cadence/reference/operators/dequantize_per_tensor.cpp

Lines changed: 8 additions & 11 deletions
@@ -13,9 +13,10 @@ namespace impl {
 namespace reference {
 namespace native {
 
-using executorch::aten::ScalarType;
-using executorch::aten::Tensor;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
 
 void dequantize_per_tensor_out(
     KernelRuntimeContext& context,
@@ -31,22 +32,18 @@ void dequantize_per_tensor_out(
 
   if (input.scalar_type() == ScalarType::Byte) {
     const uint8_t* input_data = input.const_data_ptr<uint8_t>();
-    impl::reference::kernels::dequantize<uint8_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<uint8_t>(out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Char) {
     const int8_t* input_data = input.const_data_ptr<int8_t>();
-    impl::reference::kernels::dequantize<int8_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<int8_t>(out_data, input_data, scale, zero_point, numel);
   } else if (
       input.scalar_type() == ScalarType::Bits16 ||
       input.scalar_type() == ScalarType::UInt16) {
     const uint16_t* input_data = input.const_data_ptr<uint16_t>();
-    impl::reference::kernels::dequantize<uint16_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<uint16_t>(out_data, input_data, scale, zero_point, numel);
   } else if (input.scalar_type() == ScalarType::Short) {
     const int16_t* input_data = input.const_data_ptr<int16_t>();
-    impl::reference::kernels::dequantize<int16_t>(
-        out_data, input_data, scale, zero_point, numel);
+    dequantize<int16_t>(out_data, input_data, scale, zero_point, numel);
   } else {
     ET_CHECK_MSG(
         false,
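
The dispatch above relies on a buffer-style dequantize overload. A hedged sketch of its assumed behavior, writing out[i] = scale * (in[i] - zero_point) for each of numel elements (the actual kernel may vary):

    #include <cstddef>
    #include <cstdint>

    // Sketch of the assumed buffer overload used by the dtype dispatch above.
    template <typename T>
    void dequantize(
        float* out, const T* in, float scale, int32_t zero_point, size_t numel) {
      for (size_t i = 0; i < numel; ++i) {
        out[i] = scale * static_cast<float>(static_cast<int32_t>(in[i]) - zero_point);
      }
    }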

backends/cadence/reference/operators/op_requantize_out.cpp

Lines changed: 9 additions & 7 deletions
@@ -13,9 +13,11 @@ namespace impl {
 namespace reference {
 namespace native {
 
-using executorch::aten::ScalarType;
-using executorch::aten::Tensor;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
+using ::impl::reference::kernels::quantize;
 
 // Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor.
 // The scale and zero_point for requantization are in the args.
@@ -91,9 +93,9 @@ Tensor& requantize_out(
     dtype* out_data = out.mutable_data_ptr<dtype>();                          \
     for (size_t i = 0; i < numel; ++i) {                                      \
       float dequant =                                                         \
-          kernels::dequantize<ctype>(input_data[i], in_scale, in_zero_point); \
+          dequantize<ctype>(input_data[i], in_scale, in_zero_point);          \
       out_data[i] =                                                           \
-          kernels::quantize<dtype>(dequant, 1 / out_scale, out_zero_point);   \
+          quantize<dtype>(dequant, 1 / out_scale, out_zero_point);            \
     };
 
 #define typed_requantize_in(ctype) \
@@ -193,9 +195,9 @@ Tensor& requantize_per_tensor_out(
     dtype* out_data = out.mutable_data_ptr<dtype>();                          \
     for (size_t i = 0; i < numel; ++i) {                                      \
       float dequant =                                                         \
-          kernels::dequantize<ctype>(input_data[i], in_scale, in_zero_point); \
+          dequantize<ctype>(input_data[i], in_scale, in_zero_point);          \
       out_data[i] =                                                           \
-          kernels::quantize<dtype>(dequant, 1 / out_scale, out_zero_point);   \
+          quantize<dtype>(dequant, 1 / out_scale, out_zero_point);            \
     };
 
 #define typed_requantize_in(ctype) \
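
The macro body composes the two helpers: dequantize with the input's (in_scale, in_zero_point), then quantize with the reciprocal of the output scale. As a worked example (values chosen purely for illustration): with in_scale = 0.05 and in_zero_point = -10, an int8 input of 30 dequantizes to 0.05 * (30 - (-10)) = 2.0; with out_scale = 0.1 and out_zero_point = 128, it requantizes to round(2.0 / 0.1) + 128 = 148, which fits in uint8.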

backends/cadence/reference/operators/quantized_add_out.cpp

Lines changed: 16 additions & 14 deletions
@@ -14,8 +14,10 @@ namespace impl {
 namespace reference {
 namespace native {
 
-using executorch::aten::Tensor;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
+using ::impl::reference::kernels::quantize;
 
 template <typename T>
 void quantized_add_per_tensor_impl(
@@ -48,28 +50,28 @@ void quantized_add_per_tensor_impl(
   // Simple case: tensors have the same shape, no broadcasting
   if (X_numel == Y_numel && Y_numel == out_numel) {
     for (size_t i = 0; i < X_numel; ++i) {
-      float x = kernels::dequantize<T>(X_data[i], X_scale_f, X_zero_point_i32);
-      float y = kernels::dequantize<T>(Y_data[i], Y_scale_f, Y_zero_point_i32);
+      float x = dequantize<T>(X_data[i], X_scale_f, X_zero_point_i32);
+      float y = dequantize<T>(Y_data[i], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
-      out_data[i] = kernels::quantize<T>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<T>(z, inv_out_scale, out_zero_point_i32);
     }
   }
   // Y is a scalar tensor
   else if (Y_numel == 1) {
-    float y = kernels::dequantize<T>(Y_data[0], Y_scale_f, Y_zero_point_i32);
+    float y = dequantize<T>(Y_data[0], Y_scale_f, Y_zero_point_i32);
     for (size_t i = 0; i < X_numel; ++i) {
-      float x = kernels::dequantize<T>(X_data[i], X_scale_f, X_zero_point_i32);
+      float x = dequantize<T>(X_data[i], X_scale_f, X_zero_point_i32);
       float z = x + y;
-      out_data[i] = kernels::quantize<T>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<T>(z, inv_out_scale, out_zero_point_i32);
     }
   }
   // X is a scalar tensor
   else if (X_numel == 1) {
-    float x = kernels::dequantize<T>(X_data[0], X_scale_f, X_zero_point_i32);
+    float x = dequantize<T>(X_data[0], X_scale_f, X_zero_point_i32);
     for (size_t i = 0; i < Y_numel; ++i) {
-      float y = kernels::dequantize<T>(Y_data[i], Y_scale_f, Y_zero_point_i32);
+      float y = dequantize<T>(Y_data[i], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
-      out_data[i] = kernels::quantize<T>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<T>(z, inv_out_scale, out_zero_point_i32);
     }
   }
   // General broadcasting case - simplified implementation
@@ -80,11 +82,11 @@ void quantized_add_per_tensor_impl(
       size_t y_idx = (Y_numel == 1) ? 0 : i % Y_numel;
 
       float x =
-          kernels::dequantize<T>(X_data[x_idx], X_scale_f, X_zero_point_i32);
+          dequantize<T>(X_data[x_idx], X_scale_f, X_zero_point_i32);
       float y =
-          kernels::dequantize<T>(Y_data[y_idx], Y_scale_f, Y_zero_point_i32);
+          dequantize<T>(Y_data[y_idx], Y_scale_f, Y_zero_point_i32);
       float z = x + y;
-      out_data[i] = kernels::quantize<T>(z, inv_out_scale, out_zero_point_i32);
+      out_data[i] = quantize<T>(z, inv_out_scale, out_zero_point_i32);
     }
   }
 }
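
Note the flat modulo indexing in the general case above: each input's flat index wraps at its own element count, so with, say, out_numel = 6 and Y_numel = 3 (sizes assumed only for illustration), Y contributes elements 0,1,2,0,1,2 across the output:

    // Illustration only, under the assumed sizes above.
    for (size_t i = 0; i < 6; ++i) {
      size_t x_idx = i % 6;  // 0, 1, 2, 3, 4, 5
      size_t y_idx = i % 3;  // 0, 1, 2, 0, 1, 2
      // out[i] = quantize(dequantize(X[x_idx]) + dequantize(Y[y_idx]), ...)
    }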

backends/cadence/reference/operators/quantized_layer_norm.cpp

Lines changed: 5 additions & 2 deletions
@@ -16,6 +16,9 @@ using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::getLeadingDims;
 using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::reference::kernels::dequantize;
+using ::impl::reference::kernels::quantize;
+
 
 namespace impl {
 namespace reference {
@@ -74,10 +77,10 @@ void quantized_layer_norm_per_tensor_(
       // y[j] = (x[j] - mean) / std * kGamma + kBeta;
       // Since X is quantized, we dequantize it, compute fp32 result, and
       // quantize the result to an int8/uint8 value.
-      float val = kernels::dequantize<T>(x[j], input_scale, input_zero_point);
+      float val = dequantize<T>(x[j], input_scale, input_zero_point);
 
       val = (val - mean) * inv_std * weight_data[j] + bias_data[j];
-      y[j] = kernels::quantize<T>(val, output_inv_scale, output_zero_point);
+      y[j] = quantize<T>(val, output_inv_scale, output_zero_point);
     }
   }
 }
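
Written out, the per-element computation in both layer-norm files is y[j] = quantize((dequantize(x[j]) - mean) * inv_std * weight[j] + bias[j], output_inv_scale, output_zero_point): the normalization itself runs entirely in fp32 between the dequantize and quantize boundaries.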
