27 changes: 27 additions & 0 deletions backends/cadence/aot/quantizer/quantizer.py
@@ -372,3 +372,30 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
# Add 16-bit quantizers for LinearPattern
quantizers.append(CadenceAtenQuantizer(LinearPattern(), qconfig_A16))
super().__init__(quantizers)


class CadenceWith16BitConvActivationsQuantizer(CadenceQuantizer):
"""
Quantizer including A16 conv
"""

def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
if quantizers is None:
quantizers = []
# Add 16-bit quantizers for Conv patterns
quantizers.append(CadenceAtenQuantizer(Conv1dPattern(), qconfig_A16))
quantizers.append(CadenceAtenQuantizer(Conv2dPattern(), qconfig_A16))
super().__init__(quantizers)


class CadenceWith16BitMatmulActivationsQuantizer(CadenceQuantizer):
"""
Quantizer including A16 matmul
"""

def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
if quantizers is None:
quantizers = []
# Add 16-bit quantizers for MatmulPattern
quantizers.append(CadenceAtenQuantizer(MatmulPattern(), qconfig_A16))
super().__init__(quantizers)
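
For context, here is a minimal sketch of how one of the new quantizers might be plugged into the standard PT2E quantization flow. The tiny model, example inputs, and export call below are illustrative assumptions and are not part of this diff.

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

from executorch.backends.cadence.aot.quantizer.quantizer import (
    CadenceWith16BitMatmulActivationsQuantizer,
)


class TinyMatmul(torch.nn.Module):
    """Stand-in model with a single matmul, used only for illustration."""

    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return torch.matmul(x, y)


model = TinyMatmul().eval()
example_inputs = (torch.randn(64, 33), torch.randn(33, 128))

# Capture the graph, annotate matmuls with the A16 config, then calibrate
# and convert. This assumes the usual torch.export + PT2E flow.
exported = torch.export.export_for_training(model, example_inputs).module()
quantizer = CadenceWith16BitMatmulActivationsQuantizer()
prepared = prepare_pt2e(exported, quantizer)
prepared(*example_inputs)  # calibration pass
quantized = convert_pt2e(prepared)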
17 changes: 15 additions & 2 deletions backends/cadence/hifi/operators/op_quantized_matmul_out.cpp
@@ -8,6 +8,7 @@

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/operators.h>
#include <stdlib.h>

using executorch::aten::ScalarType;
@@ -192,8 +193,20 @@ void quantized_matmul_out(
size_t leading_dim = X.size(X.dim() - 2);
size_t out_dim = Y.size(Y.dim() - 1 - transposed);
size_t in_dim = X.size(X.dim() - 1);

if (out.scalar_type() == exec_aten::ScalarType::Byte) {
if (out.scalar_type() == exec_aten::ScalarType::Short) {
::impl::generic::native::quantized_matmul_out(
ctx,
X,
X_zero_point,
Y,
Y_zero_point,
bias,
out_multiplier,
out_shift,
out_zero_point,
transposed,
out);
} else if (out.scalar_type() == exec_aten::ScalarType::Byte) {
_typed_quantized_matmul<uint8_t>(
ctx,
X,
13 changes: 13 additions & 0 deletions backends/cadence/hifi/operators/operators.h
@@ -83,6 +83,19 @@ void quantized_linear_per_tensor_out(
const ::executorch::aten::optional<::executorch::aten::Tensor>& offset,
::executorch::aten::Tensor& out);

void quantized_matmul_out(
::executorch::runtime::KernelRuntimeContext& ctx,
const ::executorch::aten::Tensor& X,
int64_t X_zero_point,
const ::executorch::aten::Tensor& Y,
int64_t Y_zero_point,
const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
int64_t out_multiplier,
int64_t out_shift,
int64_t out_zero_point,
bool transposed,
::executorch::aten::Tensor& out);

void quantized_conv2d_nhwc_out(
::executorch::runtime::KernelRuntimeContext& ctx,
const ::executorch::aten::Tensor& input,
4 changes: 3 additions & 1 deletion backends/cadence/hifi/operators/targets.bzl
@@ -90,7 +90,6 @@ OPERATORS = [
"quantized_linear_out",
"quantized_linear_asym8sxasym8s_asym8s_per_tensor_out",
"quantized_linear_asym8uxasym8u_asym8u_per_tensor_out",
"quantized_matmul_out",
"quantized_matmul_asym8sxasym8s_asym8s_out",
"quantized_matmul_asym8uxasym8u_asym8u_out",
"quantized_relu_out",
@@ -122,3 +121,6 @@ def define_common_targets():
# Define build targets for all operators registered in the tables above.
for op in OPERATORS:
define_operator(op)

# quantized_matmul_out needs an additional dependency for int16 support
define_operator("quantized_matmul_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:quantize_matmul_out", "fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:headers",])
Copilot AI commented on Nov 20, 2025:
The deps list has a trailing comma before the closing bracket, which is unusual formatting for a single-line list. While this may not cause a syntax error in Starlark, it is inconsistent with standard formatting practices. Consider reformatting as:

define_operator("quantized_matmul_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:quantize_matmul_out", "fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:headers"])
Suggested change
define_operator("quantized_matmul_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:quantize_matmul_out", "fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:headers",])
define_operator("quantized_matmul_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:quantize_matmul_out", "fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:headers"])

145 changes: 145 additions & 0 deletions backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp
@@ -0,0 +1,145 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <gtest/gtest.h>
#include <sys/times.h>

#include <executorch/kernels/test/TestUtil.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
#include <executorch/runtime/platform/runtime.h>

#include <executorch/backends/cadence/hifi/operators/operators.h>

namespace impl {
namespace HiFi {
namespace native {
namespace {

using ::executorch::aten::Scalar;
using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::aten::TensorImpl;
using ::executorch::runtime::Error;
using ::executorch::runtime::KernelRuntimeContext;
using ::executorch::runtime::runtime_init;
using ::executorch::runtime::testing::TensorFactory;

class HiFiQuantizedMatmulTest : public OperatorTest {
public:
protected:
void quantized_matmul_out(
const Tensor& X,
int64_t X_zero_point,
const Tensor& Y,
int64_t Y_zero_point,
const std::optional<Tensor>& bias,
int64_t out_multiplier,
int64_t out_shift,
int64_t out_zero_point,
bool transposed,
Tensor& output) {
return ::impl::HiFi::native::quantized_matmul_out(
context_,
X,
X_zero_point,
Y,
Y_zero_point,
bias,
out_multiplier,
out_shift,
out_zero_point,
transposed,
output);
}
};

// Test quantized_matmul_out with int16 activations and int8 weights
TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16Test) {
TensorFactory<ScalarType::Short> tf_int16;
TensorFactory<ScalarType::Int> tf_int32;
TensorFactory<ScalarType::Char> tf_int8;

// Simple 2D case: X [64, 33] x Y [33, 128] = output [64, 128]
// Using simple values for testing
Tensor X = tf_int16.ones({64, 33});
Tensor Y = tf_int8.ones({33, 128});
// Bias not used
Tensor bias = tf_int32.full({128}, -30);
Tensor output = tf_int16.zeros({64, 128});

int64_t X_zero_point = 0;
int64_t Y_zero_point = 0;
int64_t out_multiplier = 1073741824; // 0.5 * 2^31
int64_t out_shift = 0;
int64_t out_zero_point = 0;

quantized_matmul_out(
X,
X_zero_point,
Y,
Y_zero_point,
bias, // pass bias tensor
out_multiplier,
out_shift,
out_zero_point,
false, // transposed
output);

// Verify the output is correct
// With all ones input and weights, inner dimension is 33
// Matmul result: 33, with out_multiplier = 0.5 * 2^31 (scales by 0.5)
// Expected value: 33 * 0.5 = 16.5 ≈ 16
EXPECT_EQ(output.const_data_ptr<int16_t>()[0], 16);
}

// Test quantized_matmul_out with transposed Y (int16 activations and int8
// weights)
TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16TransposedTest) {
TensorFactory<ScalarType::Short> tf_int16;
TensorFactory<ScalarType::Int> tf_int32;
TensorFactory<ScalarType::Char> tf_int8;

// Transposed case: X [64, 33] x Y^T [128, 33] = output [64, 128]
Tensor X = tf_int16.ones({64, 33});
Tensor Y = tf_int8.ones({128, 33}); // Transposed
// Bias not used
Tensor bias = tf_int32.full({128}, -30);
Tensor output = tf_int16.zeros({64, 128});

int64_t X_zero_point = 0;
int64_t Y_zero_point = 0;
int64_t out_multiplier = 1073741824; // 0.5 * 2^31
int64_t out_shift = 0;
int64_t out_zero_point = 0;

quantized_matmul_out(
X,
X_zero_point,
Y,
Y_zero_point,
bias, // pass bias tensor
out_multiplier,
out_shift,
out_zero_point,
true, // transposed
output);

// Verify the output is correct
// With all ones input and weights, inner dimension is 33
// Matmul result: 33, with out_multiplier = 0.5 * 2^31 (scales by 0.5)
// Expected value: 33 * 0.5 = 16.5 ≈ 16
EXPECT_EQ(output.const_data_ptr<int16_t>()[0], 16);
}

} // namespace
} // namespace native
} // namespace HiFi
} // namespace impl
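
As a quick sanity check on the expected value used in both tests above, the requantization arithmetic can be reproduced in a few lines of Python. This assumes a truncating fixed-point requantization; the kernel's actual rounding mode is an implementation detail, but either truncation or round-half-to-even yields 16 here.

# Reproduce the expected output of the tests above: X and Y are all ones,
# so each output element accumulates inner_dim products of 1 * 1.
inner_dim = 33
acc = inner_dim * 1 * 1             # integer accumulator before requantization
out_multiplier = 1073741824         # 0.5 * 2**31
# out_shift is 0 in both tests, so requantization is a plain Q31 multiply.
requantized = (acc * out_multiplier) >> 31
print(requantized)                  # 16  (33 * 0.5 = 16.5, truncated)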