 */

 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
+#include <executorch/backends/cadence/hifi/operators/operators.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <xa_nnlib_kernels_api.h>
+#include <xtensa/tie/xt_datacache.h>
 #include <algorithm>
 #include <cmath>
+#include <optional>

 namespace cadence {
 namespace impl {
 namespace HiFi {
 namespace native {

-using executorch::aten::Tensor;
-using executorch::runtime::getLeadingDims;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::optional;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::getLeadingDims;
+using ::executorch::runtime::KernelRuntimeContext;

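+// Quantized linear for uint8 (asym8u) activations, weights, and outputs.
+// weight_zero_point, out_multiplier, and out_shift arrive as tensors, but
+// only element [0] of each is read.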
-void quantized_linear_out(
-    KernelRuntimeContext& ctx,
-    const Tensor& src,
+void _quantized_linear_asym8u(
+    const Tensor& in,
     const Tensor& weight,
     const Tensor& bias,
-    int64_t src_zero_point,
+    int64_t in_zero_point,
     const Tensor& weight_zero_point,
     const Tensor& out_multiplier,
     const Tensor& out_shift,
     int64_t out_zero_point,
-    const executorch::aten::optional<Tensor>& offset,
+    __ET_UNUSED const optional<Tensor>& offset,
     Tensor& out) {
   // input comes in shape [leading_dims, in_dim]
   // weight comes in shape [out_dim, in_dim]
   // output comes in empty with shape [leading_dims, out_dim]
   // Perform matrix multiply (M x N) x (N x P)' => M x P
-  int64_t leading_dims = getLeadingDims(src, src.dim() - 1);
-  int64_t out_dim = weight.size(0); // = out_dim
-  int64_t in_dim = weight.size(1); // = in_dim
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim

-  const uint8_t* __restrict__ in_data = src.const_data_ptr<uint8_t>();
+  const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
   const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
   const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
   uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();

   // The nnlib kernel to compute quantized linear via matmul.
-  int32_t ret = cadence::impl::HiFi::kernels::matmul_asym8uxasym8u_asym8u(
+  int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
       out_data, // p_out
       weight_data, // p_mat1,
       in_data, // p_mat2,
@@ -59,14 +64,238 @@ void quantized_linear_out(
       out_dim, // out_offset, i.e., offset of next output element written
       1, // out_stride, i.e., stride to go to next output row
       -weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
-      -src_zero_point, // mat2_zero_bias
-      out_multiplier.const_data_ptr<int32_t>(), // out_multiplier
-      out_shift.const_data_ptr<int32_t>(), // out_shift
-      out_zero_point, // out_zero_bias
-      false); // per channel quantization
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
+      out_shift.const_data_ptr<int32_t>()[0], // out_shift
+      out_zero_point); // out_zero_bias
   ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
 }

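+// Same as above, for int8 (asym8s) activations, weights, and outputs.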
+void inline _quantized_linear_asym8s(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
+  const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multiplier.const_data_ptr<int32_t>()[0], // out_multiplier
+      out_shift.const_data_ptr<int32_t>()[0], // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
+}
+
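+// Per-tensor variant for uint8 (asym8u): the quantization parameters are
+// passed as plain scalars rather than tensors.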
+void inline _quantized_linear_per_tensor_asym8u(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const uint8_t* __restrict__ in_data = in.const_data_ptr<uint8_t>();
+  const uint8_t* __restrict__ weight_data = weight.const_data_ptr<uint8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();
+
+  const int32_t out_multipler_int32 = static_cast<int32_t>(out_multiplier);
+  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  const int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point, // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multipler_int32, // out_multiplier
+      out_shift_int32, // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
+}
+
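+// Per-tensor variant for int8 (asym8s).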
+void inline _quantized_linear_per_tensor_asym8s(
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  // input comes in shape [leading_dims, in_dim]
+  // weight comes in shape [out_dim, in_dim]
+  // output comes in empty with shape [leading_dims, out_dim]
+  // Perform matrix multiply (M x N) x (N x P)' => M x P
+  const int64_t leading_dims = getLeadingDims(in, in.dim() - 1);
+  const int64_t out_dim = weight.size(0); // = out_dim
+  const int64_t in_dim = weight.size(1); // = in_dim
+
+  const int8_t* __restrict__ in_data = in.const_data_ptr<int8_t>();
+  const int8_t* __restrict__ weight_data = weight.const_data_ptr<int8_t>();
+  const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
+  int8_t* __restrict__ out_data = out.mutable_data_ptr<int8_t>();
+
+  const int32_t out_multipler_int32 = static_cast<int32_t>(out_multiplier);
+  const int32_t out_shift_int32 = static_cast<int32_t>(out_shift);
+
+  // The nnlib kernel to compute quantized linear via matmul.
+  const int32_t ret = xa_nn_matmul_asym8sxasym8s_asym8s(
+      out_data, // p_out
+      weight_data, // p_mat1,
+      in_data, // p_mat2,
+      bias_data, // p_bias
+      out_dim, // rows of p_mat1
+      in_dim, // cols of p_mat1
+      in_dim, // row_stride of p_mat1
+      leading_dims, // vec_count, i.e., rows of p_mat2
+      in_dim, // vec_offset of p_mat2.
+      out_dim, // out_offset, i.e., offset of next output element written
+      1, // out_stride, i.e., stride to go to next output row
+      -weight_zero_point, // mat1_zero_bias
+      -in_zero_point, // mat2_zero_bias
+      out_multipler_int32, // out_multiplier
+      out_shift_int32, // out_shift
+      out_zero_point); // out_zero_bias
+  ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear_per_tensor failed");
+}
+
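+// Operator entry point: dispatches on the output dtype (Byte -> asym8u,
+// Char -> asym8s).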
+void quantized_linear_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    const Tensor& weight_zero_point,
+    const Tensor& out_multiplier,
+    const Tensor& out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+    _quantized_linear_asym8u(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Char) {
+    _quantized_linear_asym8s(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false, "quantized linear only supported for uint8 and int8 dtypes");
+  }
+}
+
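+// Per-tensor operator entry point, with the same dtype dispatch.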
+void quantized_linear_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& in,
+    const Tensor& weight,
+    const Tensor& bias,
+    int64_t in_zero_point,
+    int64_t weight_zero_point,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    __ET_UNUSED const optional<Tensor>& offset,
+    Tensor& out) {
+  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+    _quantized_linear_per_tensor_asym8u(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Char) {
+    _quantized_linear_per_tensor_asym8s(
+        in,
+        weight,
+        bias,
+        in_zero_point,
+        weight_zero_point,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        offset,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false, "quantized linear only supported for uint8 and int8 dtypes");
+  }
+}
+
 }; // namespace native
 }; // namespace HiFi
 }; // namespace impl