Merged (changes from 4 commits)
10 changes: 10 additions & 0 deletions backends/cadence/aot/functions.yaml
@@ -219,6 +219,16 @@
- arg_meta: null
kernel_name: impl::reference::quantized_relu_per_tensor_out

- func: cadence::quantized_relu_asym8s_asym8s.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: impl::reference::quantized_relu_asym8s_asym8s_per_tensor_out

- func: cadence::quantized_relu_asym8u_asym8u.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: impl::reference::quantized_relu_asym8u_asym8u_per_tensor_out

- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
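Note on naming: `asym8s` and `asym8u` follow the NNLib convention for asymmetric signed and unsigned 8-bit quantization, and the `<input>_<output>` suffix encodes the input and output dtypes, so `quantized_relu_asym8s_asym8s` is the int8-in, int8-out variant.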
10 changes: 10 additions & 0 deletions backends/cadence/aot/functions_hifi.yaml
@@ -339,6 +339,16 @@
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_relu_per_tensor_out

- func: cadence::quantized_relu_asym8s_asym8s.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_relu_asym8s_asym8s_per_tensor_out

- func: cadence::quantized_relu_asym8u_asym8u.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: cadence::impl::HiFi::quantized_relu_asym8u_asym8u_per_tensor_out

- func: cadence::quantized_matmul.out(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
36 changes: 36 additions & 0 deletions backends/cadence/aot/ops_registrations.py
@@ -232,6 +232,20 @@
"quantized_relu.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, "
"int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
"quantized_relu_asym8s_asym8s.per_tensor(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift) -> Tensor"
)
lib.define(
"quantized_relu_asym8s_asym8s.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, "
"int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
"quantized_relu_asym8u_asym8u.per_tensor(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, int out_shift) -> Tensor"
)
lib.define(
"quantized_relu_asym8u_asym8u.per_tensor_out(Tensor X, int X_zero_point, int out_zero_point, int out_multiplier, "
"int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
"quantized_add.out(Tensor X, Tensor X_scale, Tensor X_zero_point, Tensor Y, Tensor Y_scale, "
"Tensor Y_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor(a!)"
@@ -770,6 +784,28 @@ def quantized_relu_per_tensor_meta(
return input.new_empty(input.size(), dtype=input.dtype)


@register_fake("cadence::quantized_relu_asym8s_asym8s.per_tensor")
def quantized_relu_asym8s_asym8s_per_tensor_meta(
input: torch.Tensor,
in_zero_point: int,
out_zero_point: int,
out_multiplier: int,
out_shift: int,
) -> torch.Tensor:
return input.new_empty(input.size(), dtype=input.dtype)


@register_fake("cadence::quantized_relu_asym8u_asym8u.per_tensor")
def quantized_relu_asym8u_asym8u_per_tensor_meta(
input: torch.Tensor,
in_zero_point: int,
out_zero_point: int,
out_multiplier: int,
out_shift: int,
) -> torch.Tensor:
return input.new_empty(input.size(), dtype=input.dtype)


@register_fake("cadence::fully_connected")
def fully_connected_meta(
src: torch.Tensor,
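As a quick sanity check, the `register_fake` entries above are what let the new ops trace at export time without a real kernel. A minimal sketch (assuming the Cadence op library has been loaded, e.g. by importing `ops_registrations`):

```python
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

# Under FakeTensorMode, dispatch resolves to the registered fake, which
# only computes output shape/dtype (see the meta functions above).
with FakeTensorMode():
    x = torch.empty(2, 3, dtype=torch.int8)
    y = torch.ops.cadence.quantized_relu_asym8s_asym8s.per_tensor(x, 0, 0, 1, 0)
    assert y.shape == x.shape and y.dtype == torch.int8
```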
48 changes: 48 additions & 0 deletions backends/cadence/aot/tests/test_type_dispatch_passes.py
@@ -137,3 +137,51 @@ def test_mixed_types_error(self) -> None:
with self.assertRaises(RuntimeError) as context:
cast(PassResult, p(gm)).graph_module
self.assertIn("Unsupported input types", str(context.exception))

def test_int8_dispatch_quantized_relu(self) -> None:
"""Test int8 input should dispatch to asym8s_asym8s variant for quantized_relu"""
x = torch.randint(-128, 127, (2, 3), dtype=torch.int8)
gm = single_op_builder(
placeholders=(x,),
op=exir_ops.edge.cadence.quantized_relu.per_tensor,
args=(x, 0, 0, 1, 0),
)
p = CompileTimeTypeDispatchPass()
gm = cast(PassResult, p(gm)).graph_module
# Original op should be replaced
self.assertEqual(
count_node(gm, exir_ops.edge.cadence.quantized_relu.per_tensor),
0,
)
# Should be replaced with int8 specific variant
self.assertEqual(
count_node(
gm,
exir_ops.edge.cadence.quantized_relu_asym8s_asym8s.per_tensor,
),
1,
)

def test_uint8_dispatch_quantized_relu(self) -> None:
"""Test uint8 input should dispatch to asym8u_asym8u variant for quantized_relu"""
x = torch.randint(0, 255, (2, 3), dtype=torch.uint8)
gm = single_op_builder(
placeholders=(x,),
op=exir_ops.edge.cadence.quantized_relu.per_tensor,
args=(x, 0, 0, 1, 0),
)
p = CompileTimeTypeDispatchPass()
gm = cast(PassResult, p(gm)).graph_module
# Original op should be replaced
self.assertEqual(
count_node(gm, exir_ops.edge.cadence.quantized_relu.per_tensor),
0,
)
# Should be replaced with uint8 specific variant
self.assertEqual(
count_node(
gm,
exir_ops.edge.cadence.quantized_relu_asym8u_asym8u.per_tensor,
),
1,
)
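
A hedged extension these tests suggest (hypothetical, not part of this PR): an unsupported input dtype should hit the new `RuntimeError` path for unary ops in `type_dispatch.py`.

```python
    def test_unsupported_dtype_quantized_relu(self) -> None:
        """Hypothetical: non-8-bit input should raise for quantized_relu."""
        x = torch.zeros(2, 3, dtype=torch.int32)
        gm = single_op_builder(
            placeholders=(x,),
            op=exir_ops.edge.cadence.quantized_relu.per_tensor,
            args=(x, 0, 0, 1, 0),
        )
        p = CompileTimeTypeDispatchPass()
        with self.assertRaises(RuntimeError) as context:
            cast(PassResult, p(gm)).graph_module
        self.assertIn("Unsupported input type", str(context.exception))
```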
57 changes: 40 additions & 17 deletions backends/cadence/aot/type_dispatch.py
@@ -23,40 +23,63 @@ class CompileTimeTypeDispatchPass(ExportPass):
Replaces generic ops with ops that have explicit types.
"""

    _BINARY_TYPE_DISPATCH_MAP: dict[tuple[torch.dtype, torch.dtype], str] = {
        (torch.int8, torch.int8): "asym8sxasym8s_asym8s",
        (torch.uint8, torch.uint8): "asym8uxasym8u_asym8u",
    }

    _UNARY_TYPE_DISPATCH_MAP: dict[torch.dtype, str] = {
        torch.int8: "asym8s_asym8s",
        torch.uint8: "asym8u_asym8u",
    }

    _BINARY_SUPPORTED_OPS: dict[OpOverload, str] = {
        exir_ops.edge.cadence.quantized_fully_connected.per_tensor: "quantized_fully_connected",
        exir_ops.edge.cadence.quantized_linear.per_tensor: "quantized_linear",
    }

    _SUPPORTED_UNARY_OPS: dict[OpOverload, str] = {
        exir_ops.edge.cadence.quantized_relu.per_tensor: "quantized_relu",
    }

    def call_operator(
        self,
        op: OpOverload,
        args: tuple[Argument, ...],
        kwargs: dict[str, Argument],
        meta: NodeMetadata,
    ) -> ProxyValue:
        if op in self._BINARY_SUPPORTED_OPS:
            # pyre-ignore[16]: None has no attribute `to_tensor`.
            input_dtype = args[0].to_tensor().dtype
            weight_dtype = args[1].to_tensor().dtype
            dtype_pair = (input_dtype, weight_dtype)

            if dtype_pair not in self._BINARY_TYPE_DISPATCH_MAP:
                raise RuntimeError(
                    f"Unsupported input types for {op}: {input_dtype} and {weight_dtype}"
                )

            base_op_name = self._BINARY_SUPPORTED_OPS[op]
            type_suffix = self._BINARY_TYPE_DISPATCH_MAP[dtype_pair]

            typed_op_name = f"{base_op_name}_{type_suffix}"
            typed_op = getattr(exir_ops.edge.cadence, typed_op_name).per_tensor

            return super().call_operator(typed_op, args, kwargs, meta)

        elif op in self._SUPPORTED_UNARY_OPS:
            # pyre-ignore[16]: None has no attribute `to_tensor`.
            input_dtype = args[0].to_tensor().dtype

            if input_dtype not in self._UNARY_TYPE_DISPATCH_MAP:
                raise RuntimeError(f"Unsupported input type for {op}: {input_dtype}")

            base_op_name = self._SUPPORTED_UNARY_OPS[op]
            type_suffix = self._UNARY_TYPE_DISPATCH_MAP[input_dtype]

            typed_op_name = f"{base_op_name}_{type_suffix}"
            typed_op = getattr(exir_ops.edge.cadence, typed_op_name).per_tensor

            return super().call_operator(typed_op, args, kwargs, meta)

        return super().call_operator(op, args, kwargs, meta)
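
A minimal usage sketch of the dispatch pass outside the test harness (`single_op_builder` and `count_node` are the Cadence test utilities used above; import paths assumed):

```python
import torch
from typing import cast
from executorch.exir.pass_base import PassResult

x = torch.randint(-128, 128, (2, 3), dtype=torch.int8)
gm = single_op_builder(
    placeholders=(x,),
    op=exir_ops.edge.cadence.quantized_relu.per_tensor,
    args=(x, 0, 0, 1, 0),
)
gm = cast(PassResult, CompileTimeTypeDispatchPass()(gm)).graph_module
# The generic per-tensor relu is now the int8-specific variant.
assert count_node(gm, exir_ops.edge.cadence.quantized_relu_asym8s_asym8s.per_tensor) == 1
```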
@@ -0,0 +1,52 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <xa_nnlib_kernels_api.h>

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

void quantized_relu_asym8s_asym8s_per_tensor_out(
KernelRuntimeContext& ctx,
const Tensor& input,
const int64_t in_zero_point,
const int64_t out_zero_point,
const int64_t out_multiplier,
const int64_t out_shift,
Tensor& output) {
const int8_t* __restrict__ input_data = input.const_data_ptr<int8_t>();
int8_t* __restrict__ output_data = output.mutable_data_ptr<int8_t>();

  const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);

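  // Assumed NNLib semantics: ReLU relative to in_zero_point, then a
  // fixed-point requantize; -128/127 are the int8 output clamp bounds.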
const int32_t ret = xa_nn_vec_relu_asym8s_asym8s(
output_data,
input_data,
in_zero_point,
      out_multiplier_int32,
out_shift_int32,
out_zero_point,
-128,
127,
input.numel());
ET_DCHECK_MSG(
ret == 0, "HiFi quantized_relu_asym8s_asym8s_per_tensor failed");
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
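
For intuition, a rough Python model of the arithmetic above (a sketch assuming a Q31 `out_multiplier` and round-to-nearest; the NNLib kernel's exact rounding may differ):

```python
import torch

def relu_asym8s_ref(x: torch.Tensor, in_zp: int, out_zp: int,
                    out_multiplier: int, out_shift: int) -> torch.Tensor:
    # ReLU relative to the input zero point.
    acc = torch.clamp(x.to(torch.int64), min=in_zp) - in_zp
    # Assumed requantization: Q31 multiplier plus a power-of-two shift.
    acc = torch.round(acc.double() * out_multiplier * (2.0 ** out_shift) / 2**31)
    # Re-add the output zero point; -128/127 mirror the clamp bounds above.
    return torch.clamp(acc.long() + out_zp, -128, 127).to(torch.int8)
```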
@@ -0,0 +1,52 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/cadence/hifi/kernels/kernels.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <xa_nnlib_kernels_api.h>

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

void quantized_relu_asym8u_asym8u_per_tensor_out(
KernelRuntimeContext& ctx,
const Tensor& input,
const int64_t in_zero_point,
const int64_t out_zero_point,
const int64_t out_multiplier,
const int64_t out_shift,
Tensor& output) {
const uint8_t* __restrict__ input_data = input.const_data_ptr<uint8_t>();
uint8_t* __restrict__ output_data = output.mutable_data_ptr<uint8_t>();

  const int32_t out_multiplier_int32 = static_cast<int32_t>(out_multiplier);
const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);

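  // Assumed NNLib semantics: same requantizing ReLU as the asym8s variant,
  // with 0/255 as the uint8 output clamp bounds.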
const int32_t ret = xa_nn_vec_relu_asym8u_asym8u(
output_data,
input_data,
in_zero_point,
      out_multiplier_int32,
      out_shift_int32,
      out_zero_point,
0,
255,
input.numel());
ET_DCHECK_MSG(
ret == 0, "HiFi quantized_relu_asym8u_asym8u_per_tensor failed");
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
2 changes: 2 additions & 0 deletions backends/cadence/hifi/operators/targets.bzl
@@ -73,6 +73,8 @@ OPERATORS = [
"quantized_linear_asym8uxasym8u_asym8u_per_tensor_out",
"quantized_matmul_out",
"quantized_relu_out",
"quantized_relu_asym8s_asym8s_per_tensor_out",
"quantized_relu_asym8u_asym8u_per_tensor_out",
"quantize_per_tensor",
"remainder",
"rsqrt",
Expand Down