
Commit 40d03e1

RahulC7 authored and facebook-github-bot committed
2/n Enable 16-bit activations and 8-bit weights in Cadence Quantizer for linear (#15901)
Summary:

# Context

We continue from D84284794 to add support for 16-bit activations. Note that although the operators already support 16-bit activations, they only do so when the weights are also 16-bit. To lift that restriction, we need to change the way we template some functions.

# Current Behavior

Right now, we compose two macros: the `ET_FORALL_JARVIS_QUANTIZED_TYPES_WITH_INT16` macro:

https://www.internalfb.com/code/fbsource/[9e8c6d8466107f58aa3de1b9e4ec71c49d670a8f]/fbcode/on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/operators.h?lines=22-25

and the function macro (`quantized_linear` chosen as an example):

https://www.internalfb.com/code/fbsource/[9e8c6d8466107f58aa3de1b9e4ec71c49d670a8f]/fbcode/on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/quantized_linear_out.cpp?lines=30-41

Together they expand into a switch statement that calls the `quantized_linear` function with the correct template parameter. However, this assumes that the input activations and the weights have the same dtype, which is no longer the case.

# This Diff

We fix the generic implementation by allowing two template parameters: one for the weights and one for the input activations.

Reviewed By: hsharma35

Differential Revision: D86538176
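To make the templating change concrete, here is a minimal, self-contained sketch (with hypothetical names, not the actual Jarvis/ExecuTorch macros or types) of the idea: the generic kernel takes separate template parameters for the activation dtype and the weight dtype, so the dispatch can instantiate the `<int16_t, int8_t>` combination that a single shared parameter could not express.

```cpp
// Hypothetical illustration of dispatching on two dtypes instead of one.
// ScalarType, quantized_linear_generic, and dispatch are stand-in names.
#include <cstdint>
#include <iostream>

enum class ScalarType { Char, Short };

template <typename AT, typename WT>
void quantized_linear_generic(/* in, weight, bias, quant params, out */) {
  std::cout << sizeof(AT) * 8 << "-bit activations x " << sizeof(WT) * 8
            << "-bit weights\n";
}

void dispatch(ScalarType act_dtype, ScalarType weight_dtype) {
  // Before this diff: a single switch always instantiated <T, T>.
  // After: the activation dtype and the weight dtype are dispatched separately.
  if (act_dtype == ScalarType::Short && weight_dtype == ScalarType::Char) {
    quantized_linear_generic<int16_t, int8_t>();  // new 16-bit act / 8-bit weight path
  } else if (act_dtype == ScalarType::Char && weight_dtype == ScalarType::Char) {
    quantized_linear_generic<int8_t, int8_t>();   // existing 8-bit path
  }
}

int main() {
  dispatch(ScalarType::Short, ScalarType::Char);  // prints "16-bit activations x 8-bit weights"
}
```

In the real operator the dispatch is done on tensor scalar types (Short activations and output, Char weights), as the diff below shows.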
1 parent d2c011e commit 40d03e1

File tree

3 files changed (+175, -5 lines)


backends/cadence/hifi/operators/op_quantized_linear_out.cpp

Lines changed: 39 additions & 4 deletions
@@ -9,6 +9,7 @@
 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
 #include <executorch/backends/cadence/hifi/operators/operators.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/operators.h>
 #include <xa_nnlib_kernels_api.h>
 #include <xtensa/tie/xt_datacache.h>
 #include <algorithm>
@@ -207,7 +208,7 @@ void inline _quantized_linear_per_tensor_asym8s(
 }
 
 void quantized_linear_out(
-    __ET_UNUSED KernelRuntimeContext& ctx,
+    KernelRuntimeContext& ctx,
     const Tensor& in,
     const Tensor& weight,
     const Tensor& bias,
@@ -216,9 +217,26 @@ void quantized_linear_out(
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     int64_t out_zero_point,
-    __ET_UNUSED const optional<Tensor>& offset,
+    const optional<Tensor>& offset,
     Tensor& out) {
-  if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
+  if (out.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      in.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      weight.scalar_type() == ::executorch::aten::ScalarType::Char) {
+    ::impl::generic::native::quantized_linear_out(
+        ctx,
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  }
+
+  else if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
     _quantized_linear_asym8u(
         in,
         weight,
@@ -260,7 +278,24 @@ void quantized_linear_per_tensor_out(
     int64_t out_zero_point,
     __ET_UNUSED const optional<Tensor>& offset,
     Tensor& out) {
-  if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
+  if (out.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      in.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      weight.scalar_type() == ::executorch::aten::ScalarType::Char) {
+    ::impl::generic::native::quantized_linear_per_tensor_out(
+        ctx,
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  }
+
+  else if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
     _quantized_linear_per_tensor_asym8u(
         in,
         weight,

backends/cadence/hifi/operators/targets.bzl

Lines changed: 4 additions & 1 deletion
@@ -87,7 +87,6 @@ OPERATORS = [
    "quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor_out",
    "quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out",
    "quantized_layer_norm",
-    "quantized_linear_out",
    "quantized_linear_asym8sxasym8s_asym8s_per_tensor_out",
    "quantized_linear_asym8uxasym8u_asym8u_per_tensor_out",
    "quantized_matmul_out",
@@ -122,3 +121,7 @@ def define_common_targets():
     # Define build targets for all operators registered in the tables above.
     for op in OPERATORS:
         define_operator(op)
+
+    # quantized_linear_out and quantized_linear_per_tensor_out needs additional dependency for int16 support
+    define_operator("quantized_linear_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:quantize_linear_out", "fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:headers",])
+    define_operator("quantized_linear_per_tensor_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:quantize_linear_out", "fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:headers",])
Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+#include <sys/times.h>
+
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <executorch/backends/cadence/hifi/operators/operators.h>
+
+namespace impl {
+namespace HiFi {
+namespace native {
+namespace {
+
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::aten::TensorImpl;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::executorch::runtime::runtime_init;
+using ::executorch::runtime::testing::TensorFactory;
+using std::optional;
+using std::string_view;
+
+class HiFiQuantizedLinearTest : public OperatorTest {
+ public:
+ protected:
+  void quantized_linear_out(
+      const Tensor& input,
+      const Tensor& weight,
+      const Tensor& bias,
+      int64_t in_zero_point,
+      const Tensor& weight_zero_point,
+      const Tensor& out_multiplier,
+      const Tensor& out_shift,
+      int64_t out_zero_point,
+      const optional<Tensor>& offset,
+      Tensor& output) {
+    return ::impl::HiFi::native::quantized_linear_out(
+        context_,
+        input,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        output);
+  }
+
+  void quantized_linear_per_tensor_out(
+      const Tensor& input,
+      const Tensor& weight,
+      const Tensor& bias,
+      int64_t in_zero_point,
+      int64_t weight_zero_point,
+      int64_t out_multiplier,
+      int64_t out_shift,
+      int64_t out_zero_point,
+      const optional<Tensor>& offset,
+      Tensor& output) {
+    return ::impl::HiFi::native::quantized_linear_per_tensor_out(
+        context_,
+        input,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        output);
+  }
+};
+
+// Test quantized_linear_out with int16 activations (asym8s)
+TEST_F(HiFiQuantizedLinearTest, QuantizedLinearInt16Test) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+
+  // Simple 2D case: input [2, 3] x weight [4, 3] = output [2, 4]
+  // Values captured from e2e test with
+  // CadenceWith16BitLinearActivationsQuantizer
+  Tensor input =
+      tf_int16.make({2, 3}, {-28170, -26389, -32768, -31474, -32266, -29076});
+  Tensor weight = tf_int8.make(
+      {4, 3}, {1, 87, -128, -114, -59, 44, -1, 127, -12, 44, -46, -29});
+  Tensor bias = tf_int32.zeros({4});
+  Tensor output = tf_int16.zeros({2, 4});
+
+  int64_t in_zero_point = -29822;
+  Tensor weight_zero_point = tf_int32.make({1}, {2});
+  Tensor out_multiplier = tf_int32.make({1}, {2011373824});
+  Tensor out_shift = tf_int32.make({1}, {-8});
+  int64_t out_zero_point = -30847;
+  quantized_linear_out(
+      input,
+      weight,
+      bias,
+      in_zero_point,
+      weight_zero_point,
+      out_multiplier,
+      out_shift,
+      out_zero_point,
+      std::nullopt,
+      output);
+  // Expected output from e2e test
+  Tensor expected_output = tf_int16.make(
+      {2, 4}, {-28384, -32767, -29144, -30862, -31956, -29486, -31985, -30756});
+  EXPECT_TENSOR_CLOSE(output, expected_output);
+}
+
+} // namespace
+} // namespace native
+} // namespace HiFi
+} // namespace impl
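The expected values in the test above were captured from an e2e run with CadenceWith16BitLinearActivationsQuantizer. As a sanity check, a plain affine-quantization reference computation reproduces them. The sketch below is a hypothetical stand-alone check, not the HiFi kernel's actual fixed-point implementation: it assumes the usual convention of accumulating (in - in_zp) * (w - w_zp) plus bias in wide integers, scaling by the Q31 out_multiplier and by 2^out_shift, and adding out_zero_point; the function name and the double-precision requantization are illustrative choices.

```cpp
// Reference computation for one output element of quantized_linear
// (hypothetical helper, for checking the test values above).
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int16_t quantized_linear_ref(
    const std::vector<int16_t>& in_row,  // one row of the input, length K
    const std::vector<int8_t>& w_row,    // one row of the weight, length K
    int64_t bias,
    int64_t in_zero_point,
    int64_t weight_zero_point,
    int64_t out_multiplier,              // Q31 fixed-point multiplier
    int64_t out_shift,                   // power-of-two shift (may be negative)
    int64_t out_zero_point) {
  // Accumulate (in - in_zp) * (w - w_zp) in 64-bit to avoid overflow.
  int64_t acc = bias;
  for (std::size_t k = 0; k < in_row.size(); ++k) {
    acc += (static_cast<int64_t>(in_row[k]) - in_zero_point) *
           (static_cast<int64_t>(w_row[k]) - weight_zero_point);
  }
  // Requantize: scale by out_multiplier / 2^31 and by 2^out_shift, add zero point.
  const double scale = static_cast<double>(out_multiplier) / (1LL << 31) *
      std::ldexp(1.0, static_cast<int>(out_shift));
  const int64_t result = std::llround(acc * scale) + out_zero_point;
  // Clamp to the int16 output range.
  return static_cast<int16_t>(std::clamp<int64_t>(result, -32768, 32767));
}

int main() {
  // Row 0 of the input and weight tensors from QuantizedLinearInt16Test above.
  const std::vector<int16_t> in_row{-28170, -26389, -32768};
  const std::vector<int8_t> w_row{1, 87, -128};
  std::cout << quantized_linear_ref(
                   in_row, w_row, /*bias=*/0, /*in_zero_point=*/-29822,
                   /*weight_zero_point=*/2, /*out_multiplier=*/2011373824,
                   /*out_shift=*/-8, /*out_zero_point=*/-30847)
            << "\n";  // prints -28384, matching expected_output[0][0]
}
```

Running this against the first input row and first weight row of the test yields -28384, which matches the first element of expected_output.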
