diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py
index 70b16b86fda..e0256437022 100644
--- a/backends/cadence/aot/quantizer/quantizer.py
+++ b/backends/cadence/aot/quantizer/quantizer.py
@@ -372,3 +372,30 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
         # Add 16-bit quantizers for LinearPattern
         quantizers.append(CadenceAtenQuantizer(LinearPattern(), qconfig_A16))
         super().__init__(quantizers)
+
+
+class CadenceWith16BitConvActivationsQuantizer(CadenceQuantizer):
+    """
+    Quantizer including A16 conv
+    """
+
+    def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
+        if quantizers is None:
+            quantizers = []
+        # Add 16-bit quantizers for Conv patterns
+        quantizers.append(CadenceAtenQuantizer(Conv1dPattern(), qconfig_A16))
+        quantizers.append(CadenceAtenQuantizer(Conv2dPattern(), qconfig_A16))
+        super().__init__(quantizers)
+
+
+class CadenceWith16BitMatmulActivationsQuantizer(CadenceQuantizer):
+    """
+    Quantizer including A16 matmul
+    """
+
+    def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
+        if quantizers is None:
+            quantizers = []
+        # Add 16-bit quantizers for MatmulPattern
+        quantizers.append(CadenceAtenQuantizer(MatmulPattern(), qconfig_A16))
+        super().__init__(quantizers)
diff --git a/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp b/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp
index 90fe483660b..732f302f9d3 100644
--- a/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp
@@ -8,6 +8,7 @@
 #include
 #include
+#include
 #include

 using executorch::aten::ScalarType;
@@ -192,8 +193,20 @@ void quantized_matmul_out(
   size_t leading_dim = X.size(X.dim() - 2);
   size_t out_dim = Y.size(Y.dim() - 1 - transposed);
   size_t in_dim = X.size(X.dim() - 1);
-
-  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+  if (out.scalar_type() == exec_aten::ScalarType::Short) {
+    ::impl::generic::native::quantized_matmul_out(
+        ctx,
+        X,
+        X_zero_point,
+        Y,
+        Y_zero_point,
+        bias,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        transposed,
+        out);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Byte) {
     _typed_quantized_matmul(
         ctx,
         X,
diff --git a/backends/cadence/hifi/operators/operators.h b/backends/cadence/hifi/operators/operators.h
index f7f5194d91a..e73baa1830c 100644
--- a/backends/cadence/hifi/operators/operators.h
+++ b/backends/cadence/hifi/operators/operators.h
@@ -83,6 +83,19 @@ void quantized_linear_per_tensor_out(
     const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
     ::executorch::aten::Tensor& out);

+void quantized_matmul_out(
+    ::executorch::runtime::KernelRuntimeContext& ctx,
+    const ::executorch::aten::Tensor& X,
+    int64_t X_zero_point,
+    const ::executorch::aten::Tensor& Y,
+    int64_t Y_zero_point,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    bool transposed,
+    ::executorch::aten::Tensor& out);
+
 void quantized_conv2d_nhwc_out(
     ::executorch::runtime::KernelRuntimeContext& ctx,
     const ::executorch::aten::Tensor& input,
diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl
index a25dfd1bcbc..81fb4ee73dd 100644
--- a/backends/cadence/hifi/operators/targets.bzl
+++ b/backends/cadence/hifi/operators/targets.bzl
@@ -90,7 +90,6 @@ OPERATORS = [
     "quantized_linear_out",
"quantized_linear_asym8sxasym8s_asym8s_per_tensor_out", "quantized_linear_asym8uxasym8u_asym8u_per_tensor_out", - "quantized_matmul_out", "quantized_matmul_asym8sxasym8s_asym8s_out", "quantized_matmul_asym8uxasym8u_asym8u_out", "quantized_relu_out", @@ -122,3 +121,6 @@ def define_common_targets(): # Define build targets for all operators registered in the tables above. for op in OPERATORS: define_operator(op) + + # quantized_matmul_out needs additional dependency for int16 support + define_operator("quantized_matmul_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:quantize_matmul_out", "fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:headers",]) diff --git a/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp b/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp new file mode 100644 index 00000000000..0bee282fabe --- /dev/null +++ b/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace impl { +namespace HiFi { +namespace native { +namespace { + +using ::executorch::aten::Scalar; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::aten::TensorImpl; +using ::executorch::runtime::Error; +using ::executorch::runtime::KernelRuntimeContext; +using ::executorch::runtime::runtime_init; +using ::executorch::runtime::testing::TensorFactory; + +class HiFiQuantizedMatmulTest : public OperatorTest { + public: + protected: + void quantized_matmul_out( + const Tensor& X, + int64_t X_zero_point, + const Tensor& Y, + int64_t Y_zero_point, + const std::optional& bias, + int64_t out_multiplier, + int64_t out_shift, + int64_t out_zero_point, + bool transposed, + Tensor& output) { + return ::impl::HiFi::native::quantized_matmul_out( + context_, + X, + X_zero_point, + Y, + Y_zero_point, + bias, + out_multiplier, + out_shift, + out_zero_point, + transposed, + output); + } +}; + +// Test quantized_matmul_out with int16 activations and int8 weights +TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16Test) { + TensorFactory tf_int16; + TensorFactory tf_int32; + TensorFactory tf_int8; + + // Simple 2D case: X [64, 33] x Y [33, 128] = output [64, 128] + // Using simple values for testing + Tensor X = tf_int16.ones({64, 33}); + Tensor Y = tf_int8.ones({33, 128}); + // Bias not used + Tensor bias = tf_int32.full({128}, -30); + Tensor output = tf_int16.zeros({64, 128}); + + int64_t X_zero_point = 0; + int64_t Y_zero_point = 0; + int64_t out_multiplier = 1073741824; // 0.5 * 2^31 + int64_t out_shift = 0; + int64_t out_zero_point = 0; + + quantized_matmul_out( + X, + X_zero_point, + Y, + Y_zero_point, + bias, // pass bias tensor + out_multiplier, + out_shift, + out_zero_point, + false, // transposed + output); + + // Verify the output is correct + // With all ones input and weights, inner dimension is 33 + // Matmul result: 33, with out_multiplier = 0.5 * 2^31 (scales by 0.5) + // Expected value: 33 * 0.5 = 16.5 ≈ 16 + EXPECT_EQ(output.const_data_ptr()[0], 16); +} + +// Test quantized_matmul_out with transposed Y (int16 activations and int8 +// weights) +TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16TransposedTest) { + TensorFactory 
tf_int16; + TensorFactory tf_int32; + TensorFactory tf_int8; + + // Transposed case: X [64, 33] x Y^T [128, 33] = output [64, 128] + Tensor X = tf_int16.ones({64, 33}); + Tensor Y = tf_int8.ones({128, 33}); // Transposed + // Bias not used + Tensor bias = tf_int32.full({128}, -30); + Tensor output = tf_int16.zeros({64, 128}); + + int64_t X_zero_point = 0; + int64_t Y_zero_point = 0; + int64_t out_multiplier = 1073741824; // 0.5 * 2^31 + int64_t out_shift = 0; + int64_t out_zero_point = 0; + + quantized_matmul_out( + X, + X_zero_point, + Y, + Y_zero_point, + bias, // pass bias tensor + out_multiplier, + out_shift, + out_zero_point, + true, // transposed + output); + + // Verify the output is correct + // With all ones input and weights, inner dimension is 33 + // Matmul result: 33, with out_multiplier = 0.5 * 2^31 (scales by 0.5) + // Expected value: 33 * 0.5 = 16.5 ≈ 16 + EXPECT_EQ(output.const_data_ptr()[0], 16); +} + +} // namespace +} // namespace native +} // namespace HiFi +} // namespace impl
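
As a sanity check on the expected value of 16 in both tests: a minimal worked example of the requantization described in the test comments, assuming out_multiplier is a Q1.31 fixed-point scale applied to the raw int32 accumulator with truncation, and with out_shift and out_zero_point equal to 0 as in the tests (the kernel's exact rounding and shift convention is not shown in this diff):

# All-ones int16 activations times all-ones int8 weights over an inner
# dimension of 33 give a raw accumulator of 33 for every output element.
acc = 33
out_multiplier = 1073741824  # 0.5 * 2**31
out_zero_point = 0

# Truncating Q1.31 multiply; out_shift is 0 in both tests, so it is omitted here.
result = ((acc * out_multiplier) >> 31) + out_zero_point
assert result == 16  # matches EXPECT_EQ(output.const_data_ptr<int16_t>()[0], 16)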
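
For context on the new Python quantizers, a rough sketch of how CadenceWith16BitMatmulActivationsQuantizer could be exercised, assuming the standard PT2E flow (torch.export.export_for_training plus prepare_pt2e/convert_pt2e from torch.ao.quantization.quantize_pt2e); the model, calibration inputs, and entry points below are illustrative and not part of this change:

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

from executorch.backends.cadence.aot.quantizer.quantizer import (
    CadenceWith16BitMatmulActivationsQuantizer,
)


class MatmulModule(torch.nn.Module):
    def forward(self, x, y):
        return torch.matmul(x, y)


inputs = (torch.randn(64, 33), torch.randn(33, 128))
exported = torch.export.export_for_training(MatmulModule(), inputs).module()

# Annotate the matmul pattern with the A16 config, calibrate, then convert.
prepared = prepare_pt2e(exported, CadenceWith16BitMatmulActivationsQuantizer())
prepared(*inputs)
converted = convert_pt2e(prepared)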