
Commit 9e4a0d9

RahulC7 authored and facebook-github-bot committed
2/n Enable 16-bit activations and 8-bit weights in Cadence Quantizer for linear
Summary:

# Context

We continue from D84284794 to add support for 16-bit activations. Note that although the operators already support 16-bit activations, they do so only when the weights are also 16-bit. Supporting 16-bit activations with 8-bit weights requires changing the way we template some functions.

# Current Behavior

Right now, we compose two macros: the `ET_FORALL_JARVIS_QUANTIZED_TYPES_WITH_INT16` macro
https://www.internalfb.com/code/fbsource/[9e8c6d8466107f58aa3de1b9e4ec71c49d670a8f]/fbcode/on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/operators.h?lines=22-25
and the function macro (`quantized_linear` chosen as an example)
https://www.internalfb.com/code/fbsource/[9e8c6d8466107f58aa3de1b9e4ec71c49d670a8f]/fbcode/on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/quantized_linear_out.cpp?lines=30-41
Together they expand into a switch statement that calls the `quantized_linear` function with the correct template parameter. However, this assumes that the input activations and the weights have the same dtype, which is no longer the case.

# This Diff

We fix the generic implementation by allowing two generic parameters: one for the weights and one for the input activations.

Reviewed By: hsharma35

Differential Revision: D86538176
1 parent aff5086 commit 9e4a0d9
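To illustrate the change described in the summary, here is a minimal sketch of dispatching on the activation and weight dtypes independently. The names below (`ScalarType` as a local stand-in, `quantized_linear_`, `dispatch_quantized_linear`) are illustrative assumptions, not the actual Jarvis/ExecuTorch code, which drives this dispatch through the `ET_FORALL_JARVIS_QUANTIZED_TYPES_WITH_INT16` macro and `executorch::aten::ScalarType`.

```cpp
#include <cstdint>
#include <stdexcept>

// Local stand-in for executorch::aten::ScalarType (illustrative only).
enum class ScalarType { Char, Short };

// Before this change the kernel was templated on a single type shared by
// activations and weights; with two parameters, 16-bit activations (AT) can
// be paired with 8-bit weights (WT).
template <typename AT, typename WT>
void quantized_linear_(const AT* in, const WT* weight, AT* out /*, ...quant params */) {
  // ...kernel body elided...
}

// Dispatch on the two runtime dtypes independently instead of assuming they match.
inline void dispatch_quantized_linear(
    ScalarType in_dtype, ScalarType weight_dtype,
    const void* in, const void* weight, void* out) {
  if (in_dtype == ScalarType::Short && weight_dtype == ScalarType::Char) {
    quantized_linear_<int16_t, int8_t>(
        static_cast<const int16_t*>(in),
        static_cast<const int8_t*>(weight),
        static_cast<int16_t*>(out));
  } else if (in_dtype == ScalarType::Char && weight_dtype == ScalarType::Char) {
    quantized_linear_<int8_t, int8_t>(
        static_cast<const int8_t*>(in),
        static_cast<const int8_t*>(weight),
        static_cast<int8_t*>(out));
  } else {
    throw std::runtime_error("unsupported activation/weight dtype combination");
  }
}
```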

File tree

3 files changed: +172 -5 lines changed


backends/cadence/hifi/operators/op_quantized_linear_out.cpp

Lines changed: 39 additions & 4 deletions
@@ -14,6 +14,7 @@
 #include <algorithm>
 #include <cmath>
 #include <optional>
+#include <on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/operators.h>
 
 namespace impl {
 namespace HiFi {
@@ -207,7 +208,7 @@ void inline _quantized_linear_per_tensor_asym8s(
 }
 
 void quantized_linear_out(
-    __ET_UNUSED KernelRuntimeContext& ctx,
+    KernelRuntimeContext& ctx,
     const Tensor& in,
     const Tensor& weight,
     const Tensor& bias,
@@ -216,9 +217,26 @@ void quantized_linear_out(
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     int64_t out_zero_point,
-    __ET_UNUSED const optional<Tensor>& offset,
+    const optional<Tensor>& offset,
     Tensor& out) {
-  if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
+  if (out.scalar_type() == ::executorch::aten::ScalarType::Short && in.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      weight.scalar_type() == ::executorch::aten::ScalarType::Char) {
+    ::impl::generic::native::quantized_linear_out(
+        ctx,
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out
+    );
+  }
+
+  else if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
     _quantized_linear_asym8u(
         in,
         weight,
@@ -260,7 +278,24 @@ void quantized_linear_per_tensor_out(
     int64_t out_zero_point,
     __ET_UNUSED const optional<Tensor>& offset,
     Tensor& out) {
-  if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
+  if (out.scalar_type() == ::executorch::aten::ScalarType::Short && in.scalar_type() == ::executorch::aten::ScalarType::Short &&
+      weight.scalar_type() == ::executorch::aten::ScalarType::Char) {
+    ::impl::generic::native::quantized_linear_per_tensor_out(
+        ctx,
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out
+    );
+  }
+
+  else if (out.scalar_type() == executorch::aten::ScalarType::Byte) {
     _quantized_linear_per_tensor_asym8u(
         in,
         weight,

backends/cadence/hifi/operators/targets.bzl

Lines changed: 4 additions & 1 deletion
@@ -87,7 +87,6 @@ OPERATORS = [
     "quantized_fully_connected_asym8sxasym8s_asym8s_per_tensor_out",
     "quantized_fully_connected_asym8uxasym8u_asym8u_per_tensor_out",
     "quantized_layer_norm",
-    "quantized_linear_out",
     "quantized_linear_asym8sxasym8s_asym8s_per_tensor_out",
     "quantized_linear_asym8uxasym8u_asym8u_per_tensor_out",
     "quantized_matmul_out",
@@ -122,3 +121,7 @@ def define_common_targets():
     # Define build targets for all operators registered in the tables above.
     for op in OPERATORS:
         define_operator(op)
+
+    # quantized_linear_out and quantized_linear_per_tensor_out needs additional dependency for int16 support
+    define_operator("quantized_linear_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:quantize_linear_out", "fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:headers",])
+    define_operator("quantized_linear_per_tensor_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:quantize_linear_out", "fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators:headers",])
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+#include <sys/times.h>
+
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <executorch/runtime/platform/runtime.h>
+
+#include <executorch/backends/cadence/hifi/operators/operators.h>
+
+namespace impl {
+namespace HiFi {
+namespace native {
+namespace {
+
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::aten::TensorImpl;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::executorch::runtime::runtime_init;
+using ::executorch::runtime::testing::TensorFactory;
+using std::optional;
+using std::string_view;
+
+class HiFiQuantizedLinearTest : public OperatorTest {
+ public:
+ protected:
+  void quantized_linear_out(
+      const Tensor& input,
+      const Tensor& weight,
+      const Tensor& bias,
+      int64_t in_zero_point,
+      const Tensor& weight_zero_point,
+      const Tensor& out_multiplier,
+      const Tensor& out_shift,
+      int64_t out_zero_point,
+      const optional<Tensor>& offset,
+      Tensor& output) {
+    return ::impl::HiFi::native::quantized_linear_out(
+        context_,
+        input,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        output);
+  }
+
+  void quantized_linear_per_tensor_out(
+      const Tensor& input,
+      const Tensor& weight,
+      const Tensor& bias,
+      int64_t in_zero_point,
+      int64_t weight_zero_point,
+      int64_t out_multiplier,
+      int64_t out_shift,
+      int64_t out_zero_point,
+      const optional<Tensor>& offset,
+      Tensor& output) {
+    return ::impl::HiFi::native::quantized_linear_per_tensor_out(
+        context_,
+        input,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        output);
+  }
+};
+
+// Test quantized_linear_out with int16 activations (asym8s)
+TEST_F(HiFiQuantizedLinearTest, QuantizedLinearInt16Test) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+
+  // Simple 2D case: input [2, 3] x weight [4, 3] = output [2, 4]
+  // Values captured from e2e test with CadenceWith16BitLinearActivationsQuantizer
+  Tensor input = tf_int16.make({2, 3}, {-28170, -26389, -32768, -31474, -32266, -29076});
+  Tensor weight = tf_int8.make(
+      {4, 3}, {1, 87, -128, -114, -59, 44, -1, 127, -12, 44, -46, -29});
+  Tensor bias = tf_int32.zeros({4});
+  Tensor output = tf_int16.zeros({2, 4});
+
+  int64_t in_zero_point = -29822;
+  Tensor weight_zero_point = tf_int32.make({1}, {2});
+  Tensor out_multiplier = tf_int32.make({1}, {2011373824});
+  Tensor out_shift = tf_int32.make({1}, {-8});
+  int64_t out_zero_point = -30847;
+  quantized_linear_out(
+      input,
+      weight,
+      bias,
+      in_zero_point,
+      weight_zero_point,
+      out_multiplier,
+      out_shift,
+      out_zero_point,
+      std::nullopt,
+      output);
+  // Expected output from e2e test
+  Tensor expected_output = tf_int16.make({2, 4}, {-28384, -32767, -29144, -30862, -31956, -29486, -31985, -30756});
+  EXPECT_TENSOR_CLOSE(output, expected_output);
+}
+
+} // namespace
+} // namespace native
+} // namespace HiFi
+} // namespace impl
